Dataset Preprocessing¶

Library loading¶

In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import yaml
import os
import warnings
import socket
import matplotlib as plt


warnings.filterwarnings('ignore')
In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k in rcParamsDict["rcParams"]:
        print("{} {}".format(k,rcParamsDict["rcParams"][k]))
        plt.rcParams[k] = rcParamsDict["rcParams"][k]
    for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
        print("{} {}".format(k1,rcParamsDict[k1]))
scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.22.2 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3
figure.dpi 50
savefig.dpi 500
figure.figsize [10, 10]
axes.facecolor white
dotSize 20
In [3]:
DSname="UpD300"
DSDirName="Sample_S33846_C_GEX"

Configure paths¶

In [4]:
outdir = "../data/output"
if not os.path.exists(outdir):
   # Create a new directory because it does not exist
   os.makedirs(outdir)
    
if not os.path.exists("../data/output/adatas"):
   # Create a new directory because it does not exist
   os.makedirs("../data/output/adatas")

with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
results_file = outdir+'/'+DSname+'.h5ad'
In [5]:
scanpyObj = sc.read_10x_mtx('../data/'+DSDirName+'/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

scanpyObj.obs['dataset'] = DSname
scanpyObj.obs_names = [i + "_" + j for i, j in zip(scanpyObj.obs_names.tolist(), scanpyObj.obs["dataset"].tolist())]
... reading from cache file cache/..-data-Sample_S33846_C_GEX-filtered_feature_bc_matrix-matrix.h5ad
In [6]:
scanpyObj.var_names_make_unique()
scanpyObj.obs_names_make_unique()

Importing cellIDs¶

In [7]:
cellID = pd.read_csv('../data/'+DSDirName+'/aggregatedCall/aggregatedCall.tsv', sep ="\t",index_col = 0)
cellID
cellID.index = [i + "_" + DSname for i in cellID.index.tolist()]
In [8]:
cellID.Consensus.unique()
Out[8]:
array(['MIFF1', '809', 'KOLF', 'doublet', 'LowQuality'], dtype=object)
In [9]:
scanpyObj.obs['cellID'] = cellID.loc[scanpyObj.obs_names, 'Consensus']
In [10]:
scanpyObj.obs
Out[10]:
dataset cellID
AAACCCACAGCCTTCT-1_UpD300 UpD300 MIFF1
AAACCCAGTCCAGTTA-1_UpD300 UpD300 MIFF1
AAACCCAGTCTGCATA-1_UpD300 UpD300 doublet
AAACCCAGTCTTACAG-1_UpD300 UpD300 MIFF1
AAACCCAGTTCTCGCT-1_UpD300 UpD300 MIFF1
... ... ...
TTTGTTGAGTATGACA-1_UpD300 UpD300 MIFF1
TTTGTTGCATAGATCC-1_UpD300 UpD300 doublet
TTTGTTGCATCAGTGT-1_UpD300 UpD300 MIFF1
TTTGTTGTCATTGGTG-1_UpD300 UpD300 MIFF1
TTTGTTGTCTCGGCTT-1_UpD300 UpD300 MIFF1

9929 rows × 2 columns

Configure colors¶

In [11]:
cellID_colors = {}
cellID_newName_colors = {}
cellID_newNames = {}


for line in iPSC_lines_map.keys():
    cellID_colors[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["color"]
    cellID_newName_colors[iPSC_lines_map[line]["newName"]] = iPSC_lines_map[line]["color"]
    cellID_newNames[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["newName"]

scanpyObj.obs["cellID"] = scanpyObj.obs["cellID"].astype("category")
scanpyObj.obs["cellID_newName"] = scanpyObj.obs["cellID"].replace(cellID_newNames, inplace=False).astype("category")
scanpyObj.uns["cellID_colors"] = [cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
scanpyObj.uns["cellID_newName_colors"] = [cellID_newName_colors[line] for line in scanpyObj.obs["cellID_newName"].cat.categories]
In [12]:
[cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
Out[12]:
['#FF0054', '#0FB248', '#a8a5a5', '#7B00FF', '#403b3b']

Preprocessing¶

Show those genes that yield the highest fraction of counts in each single cells, across all cells.

In [13]:
sc.pl.highest_expr_genes(scanpyObj, n_top=20 )
normalizing counts per cell
    finished (0:00:00)

Subsetting according to good barcodes¶

In [14]:
goodBarcodes=pd.read_csv(outdir+'/'+DSname+'_filteredCells.tsv', sep="\t")["BCs"]+"_"+DSname
scanpyObj = scanpyObj[goodBarcodes,:]

QC metrices¶

In [15]:
scanpyObj.var['mt'] = scanpyObj.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)

scanpyObj.var['ribo'] = scanpyObj.var_names.str.startswith('RP')  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['ribo'], percent_top=None, log1p=True, inplace=True)
In [16]:
scanpyObj
Out[16]:
AnnData object with n_obs × n_vars = 6240 × 33538
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo'
    uns: 'cellID_colors', 'cellID_newName_colors'
In [17]:
scanpyObj.var
Out[17]:
gene_ids feature_types mt n_cells_by_counts mean_counts log1p_mean_counts pct_dropout_by_counts total_counts log1p_total_counts ribo
MIR1302-2HG ENSG00000243485 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM138A ENSG00000237613 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
OR4F5 ENSG00000186092 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AL627309.1 ENSG00000238009 Gene Expression False 28 0.004487 0.004477 99.551282 28.0 3.367296 False
AL627309.3 ENSG00000239945 Gene Expression False 4 0.000641 0.000641 99.935897 4.0 1.609438 False
... ... ... ... ... ... ... ... ... ... ...
AC233755.2 ENSG00000277856 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC233755.1 ENSG00000275063 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC240274.1 ENSG00000271254 Gene Expression False 780 0.149840 0.139623 87.500000 935.0 6.841616 False
AC213203.1 ENSG00000277475 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM231C ENSG00000268674 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False

33538 rows × 10 columns

In [18]:
scanpyObj.obs
Out[18]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCCAGTCTTACAG-1_UpD300 UpD300 MIFF1 CTL02A 4228 8.349721 13926.0 9.541585 764.0 6.639876 5.486141 2127.0 7.662938 15.273589
AAACCCAGTTCTCGCT-1_UpD300 UpD300 MIFF1 CTL02A 4274 8.360539 14950.0 9.612534 760.0 6.634634 5.083612 1969.0 7.585789 13.170568
AAACCCATCAACCTTT-1_UpD300 UpD300 MIFF1 CTL02A 3618 8.193953 11629.0 9.361343 738.0 6.605298 6.346203 1805.0 7.498870 15.521542
AAACCCATCCCGTGTT-1_UpD300 UpD300 MIFF1 CTL02A 1820 7.507141 3297.0 8.101071 112.0 4.727388 3.397028 152.0 5.030438 4.610251
AAACCCATCGCTAATG-1_UpD300 UpD300 MIFF1 CTL02A 2992 8.004032 9350.0 9.143239 623.0 6.436151 6.663102 1482.0 7.301822 15.850266
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTTGAGCGATCGA-1_UpD300 UpD300 MIFF1 CTL02A 2548 7.843456 5465.0 8.606302 51.0 3.951244 0.933211 539.0 6.291569 9.862762
TTTGTTGAGCTGTACT-1_UpD300 UpD300 MIFF1 CTL02A 3514 8.164795 8823.0 9.085231 210.0 5.351858 2.380143 1118.0 7.020191 12.671428
TTTGTTGAGTATGACA-1_UpD300 UpD300 MIFF1 CTL02A 2850 7.955425 7763.0 8.957253 277.0 5.627621 3.568208 1417.0 7.257003 18.253252
TTTGTTGCATCAGTGT-1_UpD300 UpD300 MIFF1 CTL02A 3891 8.266678 13577.0 9.516206 305.0 5.723585 2.246446 2587.0 7.858641 19.054283
TTTGTTGTCTCGGCTT-1_UpD300 UpD300 MIFF1 CTL02A 4494 8.410721 14987.0 9.615005 640.0 6.463029 4.270368 1881.0 7.540091 12.550877

6240 rows × 13 columns

In [19]:
sc.pl.violin(scanpyObj, ['n_genes_by_counts', 'total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['total_counts_mt','total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [20]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [21]:
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_ribo')
In [22]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [23]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [24]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True)
In [25]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [26]:
scanpyObj.write_h5ad(outdir+'/adatas/'+DSname+'_raw.h5ad')
In [27]:
sc.pp.normalize_total(scanpyObj, exclude_highly_expressed=True, max_fraction=.1)
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['MALAT1']
    finished (0:00:00)
In [28]:
sc.pp.log1p(scanpyObj)
In [29]:
scanpyObj.raw = scanpyObj

Subset according to JointHVGs and Filtered Barcodes from previous step¶

In [30]:
HVGs=pd.read_csv(outdir+"/HVG_list_intersection_Curated.txt", sep = "\t")["HVG"]

scanpyObj = scanpyObj[:,HVGs]
scanpyObj.var["highly_variable"] = True
In [31]:
#sc.pp.highly_variable_genes(scanpyObj, min_mean=0.0125, max_mean=5, min_disp=0.5)
In [32]:
#scanpyObj = scanpyObj[:, HVG.tolist()]

#Multiplexing = Multiplexing[:, Multiplexing.var.highly_variable]
In [33]:
#sc.pl.highly_variable_genes(scanpyObj)

Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.

In [34]:
sc.pp.regress_out(scanpyObj, ['total_counts',"pct_counts_mt"])
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:00:18)

Scale each gene to unit variance. Clip values exceeding standard deviation 10.

In [35]:
sc.pp.scale(scanpyObj, max_value=20)
In [36]:
scanpyObj.obs
Out[36]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCCAGTCTTACAG-1_UpD300 UpD300 MIFF1 CTL02A 4228 8.349721 13926.0 9.541585 764.0 6.639876 5.486141 2127.0 7.662938 15.273589
AAACCCAGTTCTCGCT-1_UpD300 UpD300 MIFF1 CTL02A 4274 8.360539 14950.0 9.612534 760.0 6.634634 5.083612 1969.0 7.585789 13.170568
AAACCCATCAACCTTT-1_UpD300 UpD300 MIFF1 CTL02A 3618 8.193953 11629.0 9.361343 738.0 6.605298 6.346203 1805.0 7.498870 15.521542
AAACCCATCCCGTGTT-1_UpD300 UpD300 MIFF1 CTL02A 1820 7.507141 3297.0 8.101071 112.0 4.727388 3.397028 152.0 5.030438 4.610251
AAACCCATCGCTAATG-1_UpD300 UpD300 MIFF1 CTL02A 2992 8.004032 9350.0 9.143239 623.0 6.436151 6.663102 1482.0 7.301822 15.850266
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTTGAGCGATCGA-1_UpD300 UpD300 MIFF1 CTL02A 2548 7.843456 5465.0 8.606302 51.0 3.951244 0.933211 539.0 6.291569 9.862762
TTTGTTGAGCTGTACT-1_UpD300 UpD300 MIFF1 CTL02A 3514 8.164795 8823.0 9.085231 210.0 5.351858 2.380143 1118.0 7.020191 12.671428
TTTGTTGAGTATGACA-1_UpD300 UpD300 MIFF1 CTL02A 2850 7.955425 7763.0 8.957253 277.0 5.627621 3.568208 1417.0 7.257003 18.253252
TTTGTTGCATCAGTGT-1_UpD300 UpD300 MIFF1 CTL02A 3891 8.266678 13577.0 9.516206 305.0 5.723585 2.246446 2587.0 7.858641 19.054283
TTTGTTGTCTCGGCTT-1_UpD300 UpD300 MIFF1 CTL02A 4494 8.410721 14987.0 9.615005 640.0 6.463029 4.270368 1881.0 7.540091 12.550877

6240 rows × 13 columns

Principal component analysis¶

In [37]:
sc.tl.pca(scanpyObj, svd_solver='arpack')
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
In [38]:
sc.pl.pca(scanpyObj, color='MKI67')
In [39]:
sc.pl.pca_variance_ratio(scanpyObj, log=True)
In [40]:
scanpyObj
Out[40]:
AnnData object with n_obs × n_vars = 6240 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

Computing the neighborhood graph¶

In [41]:
sc.pp.neighbors(scanpyObj, n_neighbors=10, n_pcs=9)
computing neighbors
    using 'X_pca' with n_pcs = 9
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)

Embedding the neighborhood graph¶

In [42]:
sc.tl.umap(scanpyObj)
scanpyObj
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)
Out[42]:
AnnData object with n_obs × n_vars = 6240 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [43]:
sc.pl.umap(scanpyObj, color=['MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [44]:
sc.tl.diffmap(scanpyObj)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.9977711  0.9948467  0.9936784  0.99213886 0.988092
     0.98697853 0.98452944 0.9841668  0.9830491  0.9819561  0.9796113
     0.9766024  0.97365785 0.9727122 ]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
In [45]:
sc.tl.dpt(scanpyObj)
WARNING: No root cell found. To compute pseudotime, pass the index or expression vector of a root cell, one of:
    adata.uns['iroot'] = root_cell_index
    adata.var['xroot'] = adata[root_cell_name, :].X
computing Diffusion Pseudotime using n_dcs=10
    finished: added
 (0:00:00)
In [46]:
sc.pl.diffmap(scanpyObj, color=[ 'MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [47]:
scanpyObj.X.max()
Out[47]:
20.0